#let us start by importing the relevant libraries
%matplotlib inline
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')
#import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,roc_auc_score
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
##Read the data as a dataframe
df = pd.read_csv('vehicle.csv')  # expects vehicle.csv in the current working directory
#Perform basic EDA: preview the first 20 rows (bare expression -> notebook displays it)
df.head(20)
##Data type of each attribute (feature columns are numeric; 'class' holds the label)
df.dtypes
## Shape of the data as (rows, columns)
df.shape
##Checking the presence of missing values
def missing_check(df):
    """Summarize missing values per column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column, sorted by missing count descending, with
        'Total' (number of NaNs) and 'Percent' (fraction of rows missing).
    """
    total = df.isnull().sum().sort_values(ascending=False)
    # isnull().mean() == isnull().sum() / row count: the fraction missing per column
    percent = df.isnull().mean().sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data
# Display the per-column missing-value summary built by the helper above
missing_check(df)
##5 Point summary of numerical attributes
df.describe()
# Skewness per numeric column (values far from 0 suggest a skewed distribution)
df.skew()
##Let's treat data for missing values first and then we can see the outliers
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  # OneHotEncoder is imported but never used below
le = LabelEncoder()
columns = df.columns
#Let's Label Encode our class variable:
print(columns)
# Encode the string target into integers (LabelEncoder assigns codes alphabetically)
df['class'] = le.fit_transform(df['class'])
df.shape
from sklearn.impute import SimpleImputer
newdf = df.copy()
X = newdf.iloc[:,0:19] #separating all 19 columns (18 numeric features + the encoded 'class') for imputation
# NOTE(review): the 'verbose' parameter was deprecated and then removed from
# SimpleImputer in recent scikit-learn releases; drop it if this line raises
imputer = SimpleImputer(missing_values=np.nan, strategy='median', verbose=1)
#fill missing values with the MEDIAN of each column (per the strategy above)
transformed_values = imputer.fit_transform(X)
column = X.columns
print(column)
# fit_transform returns a plain numpy array, so rebuild a DataFrame with the column names
newdf = pd.DataFrame(transformed_values, columns = column)
newdf.describe()
missing_check(newdf)
newdf.describe().T
##we got rid of all missing values let's see outliers
##Checking the presence of outliers and distribution
# NOTE(review): 'seaborn-whitegrid' was renamed 'seaborn-v0_8-whitegrid' in matplotlib >= 3.6
plt.style.use('seaborn-whitegrid')
# One histogram per column to eyeball each feature's distribution
newdf.hist(bins=20, figsize=(60,40), color='green', edgecolor = 'red')
plt.show()
## Observation
# Most of the data attributes seem to be normally distributed
# scaled_variance.1, skewness_about (skewness about 1-2) and scatter_ratio appear right skewed.
# pr.axis_rectangularity seems to have outliers as there are some gaps found in the bar plot.
skewValue = newdf.skew()
print("skewValue of dataframe attributes: ", skewValue)
#univariate analysis using boxplots (horizontal, one box per column)
sns.boxplot(data=newdf, orient="h")
# Re-plot a few columns at a time so columns on similar scales are comparable
newdf.boxplot(column=['pr.axis_aspect_ratio','skewness_about','scaled_variance'], figsize=(30,10))
newdf.boxplot(column=['radius_ratio','scaled_radius_of_gyration.1','scaled_variance.1'], figsize=(30,10))
newdf.boxplot(column=['max.length_aspect_ratio','skewness_about.1'], figsize=(30,10))
## Observation
# pr.axis_aspect_ratio, skewness_about, max.length_aspect_ratio, skewness_about.1,
# scaled_radius_of_gyration.1, scaled_variance.1 and radius_ratio
# are some of the attributes with outliers.
# Let's start Treating Outliers Using IQR: Upper whisker
newdf.shape
from scipy.stats import iqr  # imported but unused: the IQR below is computed from quantiles instead
Q1 = newdf.quantile(0.25)
Q3 = newdf.quantile(0.75)
IQR = Q3 - Q1  # inter-quartile range, per column
print(IQR)
# (row, column) indices of every value outside the 1.5*IQR whiskers
np.where((newdf < (Q1 - 1.5 * IQR)) | (newdf > (Q3 + 1.5 * IQR)))
# we can use previously calculated IQR score to filter out the outliers by keeping only valid values.
# A row is dropped if ANY of its columns falls outside the whiskers (axis=1 + any)
newdf2 = newdf[~((newdf < (Q1 - 1.5 * IQR)) |(newdf > (Q3 + 1.5 * IQR))).any(axis=1)] # rows without outliers
newdf2.shape
# let's check outliers are removed by boxplot
newdf2.boxplot(column=['pr.axis_aspect_ratio','skewness_about', 'scaled_variance','radius_ratio', 'scaled_radius_of_gyration.1',
'scaled_variance.1', 'max.length_aspect_ratio', 'skewness_about.1'], figsize=(30,10))
#from above I can see only one outlier in scaled_variance.1
#that can be tolerated because it's almost on the tip of the boxplot whisker
## Let's understand the relationship between independent variables
def correlation_heatmap(dataframe, l, w):
    """Plot an annotated heatmap of the pairwise correlations of a DataFrame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data whose column-wise correlation matrix is plotted.
    l, w : numeric
        Figure width and height in inches (passed to plt.figure).
    """
    correlation = dataframe.corr()
    plt.figure(figsize=(l, w))
    # annot=True prints each coefficient in its cell; vmax=1 pins the color scale
    sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap='viridis')
    plt.title('Correlation between different features')  # fixed typo: 'fearures'
    plt.show()
# Drop the Class column before the correlation matrix & pairplot: PCA (and the
# correlation study feeding it) should only be performed on independent attributes.
cleandf= newdf2.drop('class', axis=1)
# Bug fix: the heatmap was previously drawn on newdf2, which still contains the
# 'class' column; use the class-free frame that was built for exactly this purpose.
correlation_heatmap(cleandf, 30, 15)
# Strong relation
# - Scaled Variance & Scaled Variance.1 seems to be strongly correlated with value of 0.98
# - skewness_about_2 and hollow_ratio seems to be strongly correlated, coeff: 0.89
# - distance_circularity and radius_ratio seem to have a high positive correlation, corr coeff: 0.81
# - compactness & circularity, radius_ratio & pr.axis_aspect_ratio also seem moderately correlated, coeff: 0.67.
# - scaled_variance and scaled_radius_of_gyration, circularity & distance_circularity also seem to be highly correlated, corr coeff: 0.79
# - pr.axis_rectangularity and max.length_rectangularity also seem to be strongly correlated, coeff: 0.81
# - scatter_ratio and elongatedness seem to have a strong negative correlation, magnitude: 0.97
# - elongatedness and pr.axis_rectangularity seem to have a strong negative correlation, magnitude: 0.95
#No/little relation
# -max_length_aspect_ratio & radius_ratio have average correlation with coeff: 0.46
# - pr.axis_aspect_ratio & max_length_aspect_ratio seems to have very little correlation
# - scaled_radius_gyration & scaled_radisu_gyration.1 seems to be very little correlated
# - scaled_radius_gyration.1 & skewness_about seems to be very little correlated
# - skewness_about & skewness_about.1 not be correlated
# - skewness_about.1 and skewness_about.2 are not correlated
# Pairwise scatterplots (KDE on the diagonal) to inspect relationships visually
sns.pairplot(newdf2, diag_kind="kde")
# From above correlation matrix we can see that there are many features which are highly correlated.
# many feature pairs have a correlation above 0.9,
# so we could get rid of columns whose correlation is +-0.9 or above. There are 8 such columns:
# max.length_rectangularity
# scaled_radius_of_gyration
# skewness_about.2
# scatter_ratio
# elongatedness
# pr.axis_rectangularity
# scaled_variance
# scaled_variance.1
## use a dimension reduction algorithm such as Principal Component Analysis (PCA).
# We will go for PCA and analyse the same going forward
#Let's choose the right variable
#display how many are car,bus,van (class balance check).
newdf2['class'].value_counts()
# NOTE(review): seaborn >= 0.12 requires keyword arguments here, e.g. sns.countplot(x=newdf2['class'])
sns.countplot(newdf2['class'])
plt.show()
#. Split the data into train and test
# Columns 0..17 are the 18 silhouette features; column 18 is the encoded class
X = newdf2.iloc[:,0:18].values
y = newdf2.iloc[:,18].values
X
# We transform (centralize) the entire X (independent variable data) to zscores through transformation. We will create the PCA dimensions
# on this distribution.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_std = sc.fit_transform(X)
# 18x18 covariance of the standardized features (np.cov expects variables in rows, hence .T)
cov_matrix = np.cov(X_std.T)
# NOTE(review): print() is given two arguments here, so the literal '%s' is
# printed as-is -- '%'-formatting or an f-string was probably intended
print('Covariance Matrix \n%s', cov_matrix)
# NOTE(review): np.linalg.eigh is the numerically preferred routine for a
# symmetric matrix like a covariance matrix; eig works but may return unsorted values
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
print('Eigen Vectors \n%s', eigenvectors)
print('\n Eigen Values \n%s', eigenvalues)
# Step 3 (continued): Sort eigenvalues in descending order
# Make a list of (eigenvalue, eigenvector) pairs
eig_pairs = [(eigenvalues[index], eigenvectors[:, index]) for index in range(len(eigenvalues))]
# Sort the pairs from highest to lowest eigenvalue. Sorting with an explicit
# key fixes a latent bug: plain tuple sorting falls through to the second
# element on eigenvalue ties, and comparing numpy arrays there raises.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print(eig_pairs)
# Extract the descending ordered eigenvalues and eigenvectors
eigvalues_sorted = [pair[0] for pair in eig_pairs]
eigvectors_sorted = [pair[1] for pair in eig_pairs]
# Let's confirm our sorting worked, print out eigenvalues
print('Eigenvalues in descending order: \n%s' % eigvalues_sorted)
tot = sum(eigenvalues)
# Fraction of total variance explained by each component: 18 entries,
# one per eigenvector of the 18x18 covariance matrix
var_explained = [(i / tot) for i in sorted(eigenvalues, reverse=True)]
# Running cumulative variance; the 18th entry reaches ~100%
cum_var_exp = np.cumsum(var_explained)
# Scree plot: individual bars plus the cumulative step curve
plt.bar(range(1, 19), var_explained, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1, 19), cum_var_exp, where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()
# Observation:
# - From the plot above we can clearly observe that 8 dimensions explain ~95% of the variance in the data.
# - we will use 8 principal components going forward and calculate the reduced dimensions.
# Dimensionality Reduction
# Now 8 dimensions seems very reasonable.
#With 8 variables we can explain over 95% of the variation in the original data!
# P_reduce represents the reduced mathematical space (top-8 eigenvectors)
P_reduce = np.array(eigvectors_sorted[0:8]) # Reducing from 18 to 8 dimension space (8 x 18 matrix)
X_std_8D = np.dot(X_std,P_reduce.T) # projecting original data into principal component dimensions
Proj_data_df = pd.DataFrame(X_std_8D) # converting array to dataframe for pairplot
Proj_data_df
#Let us check it visually
sns.pairplot(Proj_data_df, diag_kind='kde')
# After dimensionality reduction using PCA our attributes have become independent with no correlation among themselves.
# As most of them show clouds of data points with no linear kind of relationship.
# Let's Fit SVC Model ON Train-test Data:
# split the data
from sklearn import model_selection
test_size = 0.30 # taking 70:30 training and test set
seed = 7 # Random number seeding for repeatability of the code
#PCA Data
pca_X_train, pca_X_test, pca_y_train, pca_y_test = model_selection.train_test_split(Proj_data_df, y, test_size=test_size, random_state=seed)
#original Data -- same seed and test size, so both splits select the same rows
Orig_X_train,Orig_X_test,Orig_y_train,Orig_y_test = train_test_split(X_std,y,test_size=0.30,random_state=seed)
#let's check the split of data (denominator is the full row count, identical for both frames)
print("{0:0.2f}% data is in training set".format((len(Orig_X_train)/len(Proj_data_df.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(Orig_X_test)/len(Proj_data_df.index)) * 100))
print(pca_X_test)
print(Orig_X_test)
def getAccuracy(testSet, predictions):
    """Return the percentage of positions where predictions match testSet.

    Parameters
    ----------
    testSet : sequence
        Ground-truth labels.
    predictions : sequence
        Predicted labels, aligned element-by-element with testSet
        (assumed to be at least as long as testSet).

    Returns
    -------
    float
        Matching fraction * 100, in [0.0, 100.0].
    """
    # zip + sum replaces the original index loop (range(len(...)) anti-idiom);
    # an empty testSet still raises ZeroDivisionError, as before.
    correct = sum(1 for actual, predicted in zip(testSet, predictions) if actual == predicted)
    return (correct / float(len(testSet))) * 100.0
from sklearn import svm  # note: SVC was already imported directly at the top as well
svc = SVC() #instantiate the object (scikit-learn defaults: RBF kernel, C=1.0)
#fit the model on the original (standardized, 18-dimension) data
svc.fit(Orig_X_train,Orig_y_train)
#predict the y value
Orig_y_predict = svc.predict(Orig_X_test)
#now fit the model on pca data with the reduced 8 dimensions
svc1 = SVC() #instantiate the object
svc1.fit(pca_X_train,pca_y_train)
#predict the y value
pca_y_predict = svc1.predict(pca_X_test)
# classifier.score() is mean accuracy, so these match the accuracy_score prints below
print("Model Score On Original Data ",svc.score(Orig_X_test, Orig_y_test))
print("Model Score On Reduced PCA Dimension ",svc1.score(pca_X_test, pca_y_test))
print("Before PCA On Original 18 Dimension",accuracy_score(Orig_y_test,Orig_y_predict))
print("After PCA(On 8 dimension)",accuracy_score(pca_y_test,pca_y_predict))
# stack (actual, predicted) pairs side by side for inspection
pca_y_grid = (np.column_stack([pca_y_test, pca_y_predict]))
print(pca_y_grid)
# side effect: writes the pairs to ocr.csv in the working directory
np.savetxt("ocr.csv", pca_y_grid , fmt='%s')
import string
# NOTE(review): lab/plab build "A".."Z" letter labels that are never used in
# this notebook -- they look like leftovers from an OCR-letters exercise.
lab= list(string.ascii_uppercase[0:26])
plab=["Pr "+s for s in lab]
# Filter those cases where the model committed mistakes and analyze them,
# i.e. which classes most mistakes occurred on?
# from sklearn import metrics
# print(metrics.confusion_matrix(pca_y_test, pca_y_predict,labels=[0, 1]))
# Calculate Confusion Matrix & Plot To Visualize it
def draw_confmatrix(y_test, yhat, str1, str2, str3, datatype ):
    """Print the 3-class confusion matrix and render it as a heatmap.

    Parameters
    ----------
    y_test, yhat : array-like
        True and predicted labels, encoded as 0/1/2.
    str1, str2, str3 : str
        Axis tick labels for classes 0, 1 and 2 respectively.
    datatype : str
        Description of the data set, used in the printed heading.
    """
    # Bug fix: 'labels' must be passed by keyword -- positional use after the
    # first two arguments was deprecated and later removed in scikit-learn,
    # where the old call raises a TypeError.
    cm = confusion_matrix(y_test, yhat, labels=[0, 1, 2])
    print("Confusion Matrix For :", "\n", datatype, cm)
    # counts are integers, so annotate with 'd' instead of the old '.2f'
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=[str1, str2, str3], yticklabels=[str1, str2, str3])
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
# Ticks map class code 0->"Van ", 1->"Car ", 2->"Bus". LabelEncoder encodes the
# class names alphabetically, so presumably 0=bus, 1=car, 2=van --
# NOTE(review): the tick ordering below may therefore be reversed; confirm
# against le.classes_ before trusting the per-class readings.
draw_confmatrix(Orig_y_test, Orig_y_predict,"Van ", "Car ", "Bus", "Original Data Set" )
draw_confmatrix(pca_y_test, pca_y_predict,"Van ", "Car ", "Bus", "For Reduced Dimensions Using PCA ")
#Classification Report Of Model built on Raw Data
print("Classification Report For Raw Data:", "\n", classification_report(Orig_y_test,Orig_y_predict))
#Classification Report Of Model built on Principal Components:
print("Classification Report For PCA:","\n", classification_report(pca_y_test,pca_y_predict))
##### confusion matrix On original observation
# - Our model on original data set has correctly classified 63 van out of 63 actuals vans and has errored only in one case where it has wrongly predicted van to be a bus.
# - IN case of 127 actual cars our svm model has correcly classified 122 cars. it has wrongly classified 3 cars to be a bus and also 1 car to be a van
# - In case of 61 instances of actual bus , our model has correctly classified 53 buses , It has faltered in classifying wrongly 0 buses to be a van and one bus to be a car.
# on PCA
# - Out of 66 actual instances of vans our model has correctly predicted 62 vans and errored in 1 instances where it wrongly classified vans to be a car.
# - Out of 127 actuals cars , our mdoel has correclty classified 117 of them to be a car and faltered in 4 cases where it wrongly classified.
# - Out of 61 actual bus , our model has correclty classified 53 of them to be a bus. It has faltered in 6 cases where it wrongly classified.
# Let us build a linear regression model on the PCA dimensions
# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()
# on pca
regression_model.fit(pca_X_train, pca_y_train)
regression_model.coef_
print("with pca data: intercept", regression_model.intercept_)
# NOTE(review): score() for a regressor is R^2 of a fit to the ENCODED class
# labels -- treating 0/1/2 as a continuous target is questionable, and the
# closing comments below mislabel this as "logistic regression".
print("PCA score: ", regression_model.score(pca_X_test, pca_y_test))
# #on original -- the same estimator object is re-fit, overwriting the PCA fit
regression_model.fit(Orig_X_train, Orig_y_train)
print("with original data coef: ", regression_model.coef_)
print("with original data: intercept", regression_model.intercept_)
print("Original score: ", regression_model.score(Orig_X_test, Orig_y_test))
# Lessons -
# 1. Uses PCA only when the original dimensions have linear relations. The original dimensions had negative curvilinear relations
# 2. Remove outliers before doing PCA. We have significant outliers which are due to mix up of the gaussians in original dimension
# Suggestion -
# 1. Segment the original data based on observations using K Means clustering
# 2. Remove the outliers from the segments
# 2. If the original dimensions show strong linear relations in the segments, then apply PCA
#Observation:
# Model Score On Original Data 0.9631147540983607
# Model Score On Reduced PCA Dimension 0.9508196721311475
# Before PCA On Original 18 Dimension 0.9631147540983607
# After PCA(On 8 dimension) 0.9508196721311475
# On the test set we saw that our support vector classifier without PCA has an accuracy score of about 96 %.
# When we applied the SVC model on the PCA (reduced) dimensions the model scored about 95 %.
# Considering that original dataframe had 18 dimensions and After PCA dimension reduced to 8,
#our model has fared well in terms of accuracy score. But it's in lesser dimension hence we can consider PCA here it really
#can afford the loss of data from those other dimensions.
# I tried both the PCA and the original data on a plain linear regression as well. Observations there:
# PCA score: 0.5417950979991832
# Original score: 0.6786314628377199
# The R^2 scores are poor with both the original and the PCA data, so we would not suggest
# using linear regression here. SVM (SVC) looks like the better approach.